The latest version on different platforms can be installed following instructions at http://bioconductor.org/install/#install-R.
Install packages used:
source("http://bioconductor.org/biocLite.R")
biocLite(c("supraHex","devtools","XGR","dplyr"))
devtools::install_github(c("hfang-bristol/supraHex", "hfang-bristol/XGR"))
A collection of aesthetic maps supported for the self-organised representation and comparison: a supra-hexagonal map, a ring-shaped map, a diamond map, a trefoil map, a butterfly map, an hourglass map, a ladder map, a bridge map, and a triangle-shaped map.
library(supraHex)
colormap <- 'lightyellow-lightblue'
color <- 'black'
# For "supraHex" shape itself
sHex <- sHexGridVariant(r=6, shape="suprahex")
df_polygon <- sHexPolygon(sHex)
df_coord <- data.frame(sHex$coord, index=1:nrow(sHex$coord))
gp_suprahex <- ggplot(data=df_polygon, aes(x,y,group=index)) + geom_polygon(aes(fill=factor(stepCentroid%%2))) + coord_fixed(ratio=1) + theme_void() + theme(legend.position="none") + scale_fill_manual(values=visColormap(colormap)(2)) + geom_text(data=df_coord, aes(x,y,label=index), color=color, size=2.5) + theme(plot.margin=unit(rep(0,4),rep("lines",4)))
# For "triangle" shape
sHex <- sHexGridVariant(r=6, shape="triangle")
df_polygon <- sHexPolygon(sHex)
df_coord <- data.frame(sHex$coord, index=1:nrow(sHex$coord))
gp_triangle <- ggplot(data=df_polygon, aes(x,y,group=index)) + geom_polygon(aes(fill=factor(stepCentroid%%2))) + coord_fixed(ratio=1) + theme_void() + theme(legend.position="none") + scale_fill_manual(values=visColormap(colormap)(2)) + geom_text(data=df_coord, aes(x,y,label=index), color=color, size=2.5) + theme(plot.margin=unit(rep(0,4),rep("lines",4)))
# For "diamond" shape
sHex <- sHexGridVariant(r=6, shape="diamond")
df_polygon <- sHexPolygon(sHex)
df_coord <- data.frame(sHex$coord, index=1:nrow(sHex$coord))
gp_diamond <- ggplot(data=df_polygon, aes(x,y,group=index)) + geom_polygon(aes(fill=factor(stepCentroid%%2))) + coord_fixed(ratio=1) + theme_void() + theme(legend.position="none") + scale_fill_manual(values=visColormap(colormap)(2)) + geom_text(data=df_coord, aes(x,y,label=index), color=color, size=2.5) + theme(plot.margin=unit(rep(0,4),rep("lines",4)))
# For "hourglass" shape
sHex <- sHexGridVariant(r=6, shape="hourglass")
df_polygon <- sHexPolygon(sHex)
df_coord <- data.frame(sHex$coord, index=1:nrow(sHex$coord))
gp_hourglass <- ggplot(data=df_polygon, aes(x,y,group=index)) + geom_polygon(aes(fill=factor(stepCentroid%%2))) + coord_fixed(ratio=1) + theme_void() + theme(legend.position="none") + scale_fill_manual(values=visColormap(colormap)(2)) + geom_text(data=df_coord, aes(x,y,label=index), color=color, size=2.5) + theme(plot.margin=unit(rep(0,4),rep("lines",4)))
# For "trefoil" shape
sHex <- sHexGridVariant(r=6, shape="trefoil")
df_polygon <- sHexPolygon(sHex)
df_coord <- data.frame(sHex$coord, index=1:nrow(sHex$coord))
gp_trefoil <- ggplot(data=df_polygon, aes(x,y,group=index)) + geom_polygon(aes(fill=factor(stepCentroid%%2))) + coord_fixed(ratio=1) + theme_void() + theme(legend.position="none") + scale_fill_manual(values=visColormap(colormap)(2)) + geom_text(data=df_coord, aes(x,y,label=index), color=color, size=2.5) + theme(plot.margin=unit(rep(0,4),rep("lines",4)))
# For "ladder" shape
sHex <- sHexGridVariant(r=6, shape="ladder")
df_polygon <- sHexPolygon(sHex)
df_coord <- data.frame(sHex$coord, index=1:nrow(sHex$coord))
gp_ladder <- ggplot(data=df_polygon, aes(x,y,group=index)) + geom_polygon(aes(fill=factor(stepCentroid%%2))) + coord_fixed(ratio=1) + theme_void() + theme(legend.position="none") + scale_fill_manual(values=visColormap(colormap)(2)) + geom_text(data=df_coord, aes(x,y,label=index), color=color, size=2.5) + theme(plot.margin=unit(rep(0,4),rep("lines",4)))
# For "butterfly" shape
sHex <- sHexGridVariant(r=6, shape="butterfly")
df_polygon <- sHexPolygon(sHex)
df_coord <- data.frame(sHex$coord, index=1:nrow(sHex$coord))
gp_butterfly <- ggplot(data=df_polygon, aes(x,y,group=index)) + geom_polygon(aes(fill=factor(stepCentroid%%2))) + coord_fixed(ratio=1) + theme_void() + theme(legend.position="none") + scale_fill_manual(values=visColormap(colormap)(2)) + geom_text(data=df_coord, aes(x,y,label=index), color=color, size=2.5) + theme(plot.margin=unit(rep(0,4),rep("lines",4)))
# For "ring" shape
sHex <- sHexGridVariant(r=6, shape="ring")
df_polygon <- sHexPolygon(sHex)
df_coord <- data.frame(sHex$coord, index=1:nrow(sHex$coord))
gp_ring <- ggplot(data=df_polygon, aes(x,y,group=index)) + geom_polygon(aes(fill=factor(stepCentroid%%2))) + coord_fixed(ratio=1) + theme_void() + theme(legend.position="none") + scale_fill_manual(values=visColormap(colormap)(2)) + geom_text(data=df_coord, aes(x,y,label=index), color=color, size=2.5) + theme(plot.margin=unit(rep(0,4),rep("lines",4)))
# For "bridge" shape
sHex <- sHexGridVariant(r=6, shape="bridge")
df_polygon <- sHexPolygon(sHex)
df_coord <- data.frame(sHex$coord, index=1:nrow(sHex$coord))
gp_bridge <- ggplot(data=df_polygon, aes(x,y,group=index)) + geom_polygon(aes(fill=factor(stepCentroid%%2))) + coord_fixed(ratio=1) + theme_void() + theme(legend.position="none") + scale_fill_manual(values=visColormap(colormap)(2)) + geom_text(data=df_coord, aes(x,y,label=index), color=color, size=2.5) + theme(plot.margin=unit(rep(0,4),rep("lines",4)))
gridExtra::grid.arrange(grobs=list(gp_suprahex,gp_ring,gp_diamond,gp_trefoil,gp_butterfly,gp_hourglass,gp_ladder,gp_bridge,gp_triangle), layout_matrix=rbind(c(1,1,2,2,3),c(1,1,2,2,3),c(4,4,5,5,6),c(4,4,5,5,6),c(7,7,8,8,9)), nrow=5, ncol=5)
In this showcase, we analyse TF and DHS datasets in K562, and compare TF datasets between K562 and Gm12878.
First of all, load the packages:
library(supraHex)
library(XGR)
# optionally, specify the local location of built-in RData
# by default:
RData.location <- "http://galahad.well.ox.ac.uk/bigdata_dev"
Then, import TF and DHS datasets:
# extract TFBSs, stored in a list of GR objects
ENCODE_TFBS_ClusteredV3_CellTypes <- xRDataLoader(RData.customised='ENCODE_TFBS_ClusteredV3_CellTypes', RData.location=RData.location)
## common to Gm12878 and K562
anno_gr_Gm12878 <- ENCODE_TFBS_ClusteredV3_CellTypes$GM12878
anno_gr_K562 <- ENCODE_TFBS_ClusteredV3_CellTypes$K562
names_common <- sort(intersect(names(anno_gr_Gm12878), names(anno_gr_K562)))
ind <- match(names_common, names(anno_gr_Gm12878))
TF_gr_Gm12878 <- anno_gr_Gm12878[ind]
ind <- match(names_common, names(anno_gr_K562))
TF_gr_K562 <- anno_gr_K562[ind]
# extract DHS, stored in a list of GR objects
ENCODE_DNaseI_ClusteredV3_CellTypes <- xRDataLoader(RData.customised='ENCODE_DNaseI_ClusteredV3_CellTypes', RData.location=RData.location)
## in K562
ind <- grep("K562", names(ENCODE_DNaseI_ClusteredV3_CellTypes))
DHS_gr_K562 <- ENCODE_DNaseI_ClusteredV3_CellTypes[ind]
Calculate gene-centric TF association scores:
anno_gr <- TF_gr_K562
res_ls <- lapply(1:length(anno_gr), function(i){
message(sprintf("Analysing %d '%s' (%s)", i, names(anno_gr)[i], as.character(Sys.time())), appendLF=T)
df_nGenes <- xGR2nGenes(data=anno_gr[[i]], format="GRanges", distance.max=50000, decay.kernel="slow", decay.exponent=2, scoring=T, scoring.scheme="max", RData.location=RData.location, verbose=F)
data.frame(df_nGenes, TF=rep(names(anno_gr)[i],nrow(df_nGenes)), stringsAsFactors=F)
})
df <- do.call(rbind, res_ls)
G2TF_K562 <- as.matrix(xSparseMatrix(df[,c("Gene","TF","Score")]))
Justify the use of the bridge-shaped map:
data <- G2TF_K562
df <- stats::prcomp(data)$x[,1:2]
gp <- ggplot(df, aes(x=PC1, y=PC2))
gp <- gp + geom_point(size=1,col='transparent') + theme_bw() + theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank())
## 2d shape plot
myColor <- xColormap(colormap="spectral")(11)
gp <- gp + stat_binhex(bins=48) + scale_fill_gradientn(colours=myColor,trans="sqrt") + geom_density2d(colour="grey")
print(gp)
Train the bridge-shaped map using cross-TF gene scores:
data <- G2TF_K562
sMap <- sPipeline(data, shape="bridge", finetuneSustain=T)
Build neighbour-joining TF tree:
D <- t(sMap$codebook)
set.seed(825)
tree_bs <- visTreeBootstrap(D, num.bootstrap=1000, nodelabels.arg=list(frame="circle",cex=0.35,col="black"), plot.phylo.arg=list(type=c("phylogram","cladogram","fan","radial")[1],cex=0.6,node.pos=1))
Visualise TF map (reordered by tree):
tree_bs_ind <- match(tree_bs$tip.label, rownames(D))
xMap <- sMap
xMap$codebook <- xMap$codebook[,tree_bs_ind]
colormap <- xColormap("jet.top")
visHexMulComp(xMap, which.components=51:1, rect.grid=c(7,8), title.rotate=0, title.xy=c(0.3,1.05), colormap=colormap, zlim=c(0,1), gp=grid::gpar(cex=0.6), newpage=F)
Calculate gene-centric DHS association scores:
anno_gr <- DHS_gr_K562
res_ls <- lapply(1:length(anno_gr), function(i){
message(sprintf("Analysing %d '%s' (%s)", i, names(anno_gr)[i], as.character(Sys.time())), appendLF=T)
df_nGenes <- xGR2nGenes(data=anno_gr[[i]], format="GRanges", distance.max=0, decay.kernel="slow", decay.exponent=2, scoring=T, scoring.scheme="max", RData.location=RData.location, verbose=F)
data.frame(df_nGenes, Cell=rep(names(anno_gr)[i],nrow(df_nGenes)), stringsAsFactors=F)
})
df <- do.call(rbind, res_ls)
G2DHS <- as.matrix(xSparseMatrix(df[,c("Gene","Cell","Score")]))
The DHS map is obtained by overlaying the DHS data onto the trained TF map in K562:
ind <- match(rownames(G2TF_K562), rownames(G2DHS))
additional <- G2DHS[ind,]
additional[is.na(additional)] <- 0
additional <- matrix(additional, ncol=1)
rownames(additional) <- rownames(G2TF_K562)
colnames(additional) <- 'DHS'
sOverlay_DHS <- sMapOverlay(sMap, data=G2TF_K562, additional=additional)
Visualise DHS map:
colormap <- xColormap("jet.top")
visHexMulComp(sOverlay_DHS, rect.grid=c(1,1), title.rotate=0, colormap=colormap, zlim=c(0,1), gp=grid::gpar(cex=1), newpage=F)
Rankings of 51 TFs in terms of similarity (Pearson correlation) to the DHS map in K562:
M_K562 <- sMap$codebook
M_DHS <- sOverlay_DHS$codebook
y <- M_DHS[,1]
names(y) <- 1:length(y)
ls_df <- lapply(1:ncol(M_K562), function(i){
x <- data.frame(name=1:nrow(M_K562), M_K562[,i], stringsAsFactors=F)
ls_res <- xCorrelation(x, y, method="pearson", p.type="nominal")
df_summary <- ls_res$df_summary
data.frame(TF=colnames(M_K562)[i], cor=df_summary$cor, stringsAsFactors=F)
})
df <- do.call(rbind, ls_df)
df$rank <- rank(-1*df$cor,ties.method="first")
df <- df %>% dplyr::arrange(rank)
## polar plot
gp_DHS <- xPolarDot(df, parallel=T, signature=F)
gp_DHS + labs(y="Pearson correlation") + ylim(0,1)
Calculate gene-centric TF association scores:
anno_gr <- TF_gr_Gm12878
res_ls <- lapply(1:length(anno_gr), function(i){
message(sprintf("Analysing %d '%s' (%s)", i, names(anno_gr)[i], as.character(Sys.time())), appendLF=T)
df_nGenes <- xGR2nGenes(data=anno_gr[[i]], format="GRanges", distance.max=50000, decay.kernel="slow", decay.exponent=2, scoring=T, scoring.scheme="max", RData.location=RData.location, verbose=F)
data.frame(df_nGenes, TF=rep(names(anno_gr)[i],nrow(df_nGenes)), stringsAsFactors=F)
})
df <- do.call(rbind, res_ls)
G2TF_Gm12878 <- as.matrix(xSparseMatrix(df[,c("Gene","TF","Score")]))
The TF map is obtained by overlaying the TF data in Gm12878 onto the trained TF map in K562:
ind <- match(rownames(G2TF_K562), rownames(G2TF_Gm12878))
additional <- G2TF_Gm12878[ind,]
additional[is.na(additional)] <- 0
sOverlay_Gm12878 <- sMapOverlay(sMap, data=G2TF_K562, additional=additional)
Rankings of 51 TFs in terms of similarity (Pearson correlation) in two cell lines (K562 and Gm12878):
M_K562 <- sMap$codebook
M_Gm12878 <- sOverlay_Gm12878$codebook
ls_df <- lapply(1:ncol(M_K562), function(i){
x <- data.frame(name=1:nrow(M_K562), M_K562[,i], stringsAsFactors=F)
y <- M_Gm12878[,i]
names(y) <- 1:nrow(M_Gm12878)
ls_res <- xCorrelation(x, y, method="pearson", p.type="nominal")
df_summary <- ls_res$df_summary
data.frame(TF=colnames(M_K562)[i], cor=df_summary$cor, stringsAsFactors=F)
})
df <- do.call(rbind, ls_df)
df$rank <- rank(-1*df$cor,ties.method="first")
df <- df %>% dplyr::arrange(rank)
## polar plot
gp_twin <- xPolarDot(df, parallel=T, signature=F)
gp_twin + labs(y="Pearson correlation") + ylim(0,1)
Visualise TF map (reordered by correlation):
## yMap
yMap <- sMap
ind <- match(df$TF, colnames(yMap$codebook))
yMap$codebook <- yMap$codebook[,ind]
## zMap
zMap <- sOverlay_Gm12878
ind <- match(df$TF, colnames(zMap$codebook))
zMap$codebook <- zMap$codebook[,ind]
## xMap
xMap <- sMap
nr <- nrow(xMap$codebook)
nc <- ncol(xMap$codebook)
matrix.combined <- matrix(NA, nrow=nr, ncol=nc*2)
matrix.combined[, seq(1,nc*2,2)] <- yMap$codebook
matrix.combined[, seq(2,nc*2,2)] <- zMap$codebook
xMap$codebook <- matrix.combined
colnames(xMap$codebook) <- rep(colnames(yMap$codebook), each=2)
## visualisation
colormap <- xColormap("jet.top")
visHexMulComp(xMap, which.components=1:(nc*2), rect.grid=c(13,8), title.rotate=0, title.xy=c(0.3,1), colormap=colormap, zlim=c(0,1), gp=grid::gpar(cex=0.4), newpage=F)
Perform enrichment analysis for TFs:
## TFs (high correlation)
data <- subset(df, rank<=25)$TF
eTerm_high <- xEnricherGenes(data=data, ontology="SF", ontology.algorithm="none", RData.location=RData.location)
## TFs (low correlation)
data <- subset(df, rank>=27)$TF
eTerm_low <- xEnricherGenes(data=data, ontology="SF", ontology.algorithm="none", RData.location=RData.location)
## visualisation
list_eTerm <- list(eTerm_low, eTerm_high)
names(list_eTerm) <- c('TFs (low correlation)', 'TFs (high correlation)')
bp <- xEnrichCompare(list_eTerm, displayBy="fc", bar.label.size=2.5, signature=F)
bp + theme(axis.text.y=element_text(size=10))
Enriched domains for TFs (high correlation):
xEnrichViewer(eTerm_high, top=NULL, details=T)
> name nAnno nOverlap fc zscore
> 46785 "Winged helix" DNA-binding domain 212 6 17.40 9.72
> 47459 HLH, helix-loop-helix DNA-binding domain 113 3 16.30 6.61
> 57667 beta-beta-alpha zinc fingers 737 6 5.01 4.52
> pvalue adjp distance members
> 46785 2.8e-08 8.3e-08 a.4.5 E2F4, ELF1, ELK1, ETS1, RAD21, SPI1
> 47459 3.0e-05 4.4e-05 a.38.1 BHLHE40, USF1, USF2
> 57667 1.1e-04 1.1e-04 g.37.1 CTCF, EGR1, MAZ, YY1, ZBTB33, ZNF143
Enriched domains for TFs (low correlation):
xEnrichViewer(eTerm_low, top=NULL, details=T)
> name nAnno nOverlap fc zscore
> 57959 Leucine zipper domain 52 5 59.2 17.00
> 47459 HLH, helix-loop-helix DNA-binding domain 113 3 16.3 6.61
> 57667 beta-beta-alpha zinc fingers 737 3 2.5 1.70
> pvalue adjp distance members
> 57959 1.6e-10 4.9e-10 h.1.3 ATF3, CEBPB, FOS, JUND, NFE2
> 47459 3.0e-05 4.4e-05 a.38.1 MAX, MXI1, MYC
> 57667 2.9e-02 2.9e-02 g.37.1 REST, SP1, ZNF274
In this showcase, we analyse TF datasets in K562, Gm12878 and H1hESC.
First of all, load the packages:
library(supraHex)
library(XGR)
# optionally, specify the local location of built-in RData
# by default:
RData.location <- "http://galahad.well.ox.ac.uk/bigdata_dev"
Then, import TF datasets shared by K562, Gm12878 and H1hESC:
# extract TFBSs, stored in a list of GR objects
ENCODE_TFBS_ClusteredV3_CellTypes <- xRDataLoader(RData.customised='ENCODE_TFBS_ClusteredV3_CellTypes', RData.location=RData.location)
## common to H1-hESC, Gm12878 and K562
anno_gr_H1hESC <- ENCODE_TFBS_ClusteredV3_CellTypes$'H1-hESC'
names_common <- sort(base::Reduce(intersect, list(names(anno_gr_Gm12878), names(anno_gr_K562), names(anno_gr_H1hESC))))
ind <- match(names_common, names(anno_gr_K562))
TF_gr_K562_tri <- anno_gr_K562[ind]
ind <- match(names_common, names(anno_gr_Gm12878))
TF_gr_Gm12878_tri <- anno_gr_Gm12878[ind]
ind <- match(names_common, names(anno_gr_H1hESC))
TF_gr_H1hESC_tri <- anno_gr_H1hESC[ind]
Calculate gene-centric TF association scores:
## G2TF_K562_tri
anno_gr <- TF_gr_K562_tri
res_ls <- lapply(1:length(anno_gr), function(i){
message(sprintf("Analysing %d '%s' (%s)", i, names(anno_gr)[i], as.character(Sys.time())), appendLF=T)
df_nGenes <- xGR2nGenes(data=anno_gr[[i]], format="GRanges", distance.max=50000, decay.kernel="slow", decay.exponent=2, scoring=T, scoring.scheme="max", RData.location=RData.location, verbose=F)
data.frame(df_nGenes, TF=rep(names(anno_gr)[i],nrow(df_nGenes)), stringsAsFactors=F)
})
df <- do.call(rbind, res_ls)
G2TF_K562_tri <- as.matrix(xSparseMatrix(df[,c("Gene","TF","Score")]))
## G2TF_Gm12878_tri
anno_gr <- TF_gr_Gm12878_tri
res_ls <- lapply(1:length(anno_gr), function(i){
message(sprintf("Analysing %d '%s' (%s)", i, names(anno_gr)[i], as.character(Sys.time())), appendLF=T)
df_nGenes <- xGR2nGenes(data=anno_gr[[i]], format="GRanges", distance.max=50000, decay.kernel="slow", decay.exponent=2, scoring=T, scoring.scheme="max", RData.location=RData.location, verbose=F)
data.frame(df_nGenes, TF=rep(names(anno_gr)[i],nrow(df_nGenes)), stringsAsFactors=F)
})
df <- do.call(rbind, res_ls)
G2TF_Gm12878_tri <- as.matrix(xSparseMatrix(df[,c("Gene","TF","Score")]))
## G2TF_H1hESC_tri
anno_gr <- TF_gr_H1hESC_tri
res_ls <- lapply(1:length(anno_gr), function(i){
message(sprintf("Analysing %d '%s' (%s)", i, names(anno_gr)[i], as.character(Sys.time())), appendLF=T)
df_nGenes <- xGR2nGenes(data=anno_gr[[i]], format="GRanges", distance.max=50000, decay.kernel="slow", decay.exponent=2, scoring=T, scoring.scheme="max", RData.location=RData.location, verbose=F)
data.frame(df_nGenes, TF=rep(names(anno_gr)[i],nrow(df_nGenes)), stringsAsFactors=F)
})
df <- do.call(rbind, res_ls)
G2TF_H1hESC_tri <- as.matrix(xSparseMatrix(df[,c("Gene","TF","Score")]))
Train the bridge-shaped map using cross-TF gene scores in K562:
data <- G2TF_K562_tri
sMap_tri <- sPipeline(data, shape="bridge", finetuneSustain=T)
TF map in Gm12878 (overlaid onto the trained TF map in K562):
ind <- match(rownames(G2TF_K562_tri), rownames(G2TF_Gm12878_tri))
additional <- G2TF_Gm12878_tri[ind,]
additional[is.na(additional)] <- 0
sOverlay_Gm12878_tri <- sMapOverlay(sMap_tri, data=G2TF_K562_tri, additional=additional)
TF map in H1hESC (overlaid onto the trained TF map in K562):
ind <- match(rownames(G2TF_K562_tri), rownames(G2TF_H1hESC_tri))
additional <- G2TF_H1hESC_tri[ind,]
additional[is.na(additional)] <- 0
sOverlay_H1hESC_tri <- sMapOverlay(sMap_tri, data=G2TF_K562_tri, additional=additional)
Correlation analysis of TF’s similarity between Gm12878 and H1hESC in terms of K562:
M_K562 <- sMap_tri$codebook
M_Gm12878 <- sOverlay_Gm12878_tri$codebook
M_H1hESC <- sOverlay_H1hESC_tri$codebook
ls_df <- lapply(1:ncol(M_K562), function(i){
x <- data.frame(name=1:nrow(M_K562), M_K562[,i], stringsAsFactors=F)
## Gm12878
y <- M_Gm12878[,i]
names(y) <- 1:nrow(M_Gm12878)
ls_res <- xCorrelation(x, y, method="pearson", p.type="nominal")
df_summary1 <- ls_res$df_summary
## H1hESC
y <- M_H1hESC[,i]
names(y) <- 1:nrow(M_H1hESC)
ls_res <- xCorrelation(x, y, method="pearson", p.type="nominal")
df_summary2 <- ls_res$df_summary
data.frame(TF=colnames(M_K562)[i], Gm12878=df_summary1$cor, H1hESC=df_summary2$cor, stringsAsFactors=F)
})
df <- do.call(rbind, ls_df)
## correlation plot
x <- df[,c('TF','Gm12878')]
y <- df$H1hESC
names(y) <- df$TF
ls_res <- xCorrelation(x, y, method="pearson", p.type="nominal", plot=T)
gp_tri <- ls_res$ls_gp[[1]]
gp_tri <- gp_tri + labs(x="Correlation between Gm12878 and K562", y="Correlation between H1hESC and K562") + xlim(0.5,1) + ylim(0.5,1)
gp_tri <- gp_tri + geom_abline(intercept=0, slope=1, color="gray50") + coord_fixed(ratio=1)
gp_tri <- gp_tri + ggrepel::geom_text_repel(data=gp_tri$data, aes(label=name), size=2, fontface='bold', box.padding=unit(0.2,"lines"), point.padding=unit(0.2,"lines"), segment.color='grey50', segment.alpha=0.5, arrow=arrow(length=unit(0.01,'npc')), force=0.1)
gp_tri
Visualise TF map (reordered by correlation difference):
## order by the difference
df$diff <- df$H1hESC - df$Gm12878
df <- df %>% dplyr::arrange(diff)
## xMap
xMap <- sMap_tri
ind <- match(df$TF, colnames(xMap$codebook))
xMap$codebook <- xMap$codebook[,ind]
## yMap
yMap <- sOverlay_Gm12878_tri
ind <- match(df$TF, colnames(yMap$codebook))
yMap$codebook <- yMap$codebook[,ind]
## zMap
zMap <- sOverlay_H1hESC_tri
ind <- match(df$TF, colnames(zMap$codebook))
zMap$codebook <- zMap$codebook[,ind]
## aMap
aMap <- sMap_tri
nr <- nrow(xMap$codebook)
nc <- ncol(xMap$codebook)
matrix.combined <- matrix(NA, nrow=nr, ncol=nc*3)
### ordered by H1hESC Gm12878 K562
matrix.combined[, seq(1,nc*3,3)] <- zMap$codebook
matrix.combined[, seq(2,nc*3,3)] <- yMap$codebook
matrix.combined[, seq(3,nc*3,3)] <- xMap$codebook
aMap$codebook <- matrix.combined
colnames(aMap$codebook) <- rep(colnames((xMap$codebook)), each=3)
## visualisation
colormap <- xColormap("jet.top")
### reverse order by K562 Gm12878 H1hESC
visHexMulComp(aMap, which.components=(nc*3):1, rect.grid=c(9,9), title.rotate=0, title.xy=c(0.3,1), colormap=colormap, zlim=c(0,1), gp=grid::gpar(cex=0.7), newpage=F)
In this showcase, we analyse TF and CRISPR datasets in K562.
First of all, load the packages:
library(supraHex)
library(XGR)
# optionally, specify the local location of built-in RData
# by default:
RData.location <- "http://galahad.well.ox.ac.uk/bigdata_dev"
Then, import TF and CRISPR datasets in K562:
# extract TF, stored in a list of GR objects
ENCODE_TFBS_ClusteredV3_CellTypes <- xRDataLoader(RData.customised='ENCODE_TFBS_ClusteredV3_CellTypes', RData.location=RData.location)
# calculate gene-centric TF association scores
anno_gr <- ENCODE_TFBS_ClusteredV3_CellTypes$K562
res_ls <- lapply(1:length(anno_gr), function(i){
message(sprintf("Analysing %d '%s' (%s)", i, names(anno_gr)[i], as.character(Sys.time())), appendLF=T)
df_nGenes <- xGR2nGenes(data=anno_gr[[i]], format="GRanges", distance.max=50000, decay.kernel="slow", decay.exponent=2, scoring=T, scoring.scheme="max", RData.location=RData.location, verbose=F)
data.frame(df_nGenes, TF=rep(names(anno_gr)[i],nrow(df_nGenes)), stringsAsFactors=F)
})
df <- do.call(rbind, res_ls)
mat <- as.matrix(xSparseMatrix(df[,c("Gene","TF","Score")]))
# import CRISPR
data.file <- file.path(RData.location,'TW_Science_CRISPR.txt')
data <- read.delim(data.file, header=TRUE, stringsAsFactors=FALSE)
data <- subset(data, K562.adjusted.p.value<0.05)
# extract genes commons to CRISPR and TF datasets in K562
ind <- match(data$Gene, rownames(mat))
G2TF_K562_crispr <- mat[ind[!is.na(ind)],]
G2CRISPR_K562 <- data.frame(CRISPR=data$K562.CS[!is.na(ind)], stringsAsFactors=F)
rownames(G2CRISPR_K562) <- data$Gene[!is.na(ind)]
Justify the use of the triangle-shaped map:
data <- G2TF_K562_crispr
df <- stats::prcomp(data)$x[,1:2]
gp <- ggplot(df, aes(x=PC1, y=PC2))
gp <- gp + geom_point(size=1,col='transparent') + theme_bw() + theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank())
## 2d shape plot
myColor <- xColormap(colormap="spectral")(11)
gp <- gp + stat_binhex(bins=24) + scale_fill_gradientn(colours=myColor,trans="sqrt") + geom_density2d(colour="grey")
print(gp)
Train the triangle-shaped map using cross-TF gene scores:
data <- G2TF_K562_crispr
sM <- sPipeline(data, shape="triangle", xdim=12, finetuneSustain=T)
The CRISPR map is obtained by overlaying the CRISPR data onto the trained TF map in K562:
sOverlay_CRISPR <- sMapOverlay(sM, data=G2TF_K562_crispr, additional=G2CRISPR_K562)
Visualise CRISPR map:
colormap <- xColormap("jet.both")
visHexMulComp(sOverlay_CRISPR, rect.grid=c(1,1), title.rotate=0, title.xy=c(0.4,1), colormap=colormap, zlim=c(-4,4), gp=grid::gpar(cex=1), newpage=F)
Rankings of 100 TFs in terms of similarity (Pearson correlation) to the CRISPR map in K562:
M_K562 <- sM$codebook
M_CRISPR <- sOverlay_CRISPR$codebook
y <- M_CRISPR[,1]
names(y) <- 1:length(y)
ls_df <- lapply(1:ncol(M_K562), function(i){
x <- data.frame(name=1:nrow(M_K562), M_K562[,i], stringsAsFactors=F)
ls_res <- xCorrelation(x, y, method="pearson", p.type="nominal")
df_summary <- ls_res$df_summary
data.frame(TF=colnames(M_K562)[i], cor=df_summary$cor, stringsAsFactors=F)
})
df <- do.call(rbind, ls_df)
df$rank <- rank(-1*df$cor,ties.method="first")
df <- df %>% dplyr::arrange(rank)
## polar plot
gp_CRISPR <- xPolarDot(df, parallel=T, signature=F)
gp_CRISPR + labs(y="Pearson correlation") + scale_y_reverse(limits=c(0,-1))
Visualise TF map (reordered by correlation):
xMap <- sM
ind <- match(df$TF, colnames(xMap$codebook))
xMap$codebook <- xMap$codebook[,ind]
colormap <- xColormap("jet.top")
visHexMulComp(xMap, which.components=100:1, rect.grid=c(10,10), title.rotate=0, title.xy=c(0.3,1.05), colormap=colormap, zlim=c(0,1), gp=grid::gpar(cex=0.6), newpage=F)
Obtain gene clusters based on TF map:
xM <- sM
ind <- match(df$TF, colnames(xM$codebook))
xM$codebook <- xM$codebook[,ind]
seed <- sDmatMinima(xM)
sBase <- sDmatCluster(xM, which_neigh=1)
colormap <- xColormap("spectral")
visDmatCluster(xM, sBase, colormap=colormap, newpage=F)
Calculate CRISPR scores per gene cluster:
colormap <- xColormap("jet.top")
output <- visDmatHeatmap(xM, data=G2TF_K562_crispr[,ind], sBase, base.color=xColormap("spectral"), base.separated.arg=list(col="white",lty=1,lwd=1), base.legend.location="bottomleft", colormap=colormap, zlim=c(0,1), KeyValueName="TF-gene association", labRow=NA, srtCol=90, cexCol=0.6, keep.data=T)
## Polar barplot
ind <- match(output$ID, rownames(G2CRISPR_K562))
output$val <- G2CRISPR_K562[ind,]
ls_tmp <- split(x=output$val, f=output$Cluster_base)
ls_df <- lapply(1:length(ls_tmp), function(i){
x <- ls_tmp[[i]]
y <- median(x)
data.frame(Meta=paste0('C',names(ls_tmp)[i]), Val=y, Num=length(x), stringsAsFactors=F)
})
df <- do.call(rbind, ls_df)
gp <- xPolarBar(df, colormap='spectral', size.name=10, size.value=2.5, parallel=F, signature=F)
gp
Perform enrichment analysis for clustered genes:
ls_cluster <- split(x=output$ID, f=output$Cluster_base)
ontologies <- c("SF","GOMF","REACTOME")
background <- output$ID
FDR.cutoff <- 0.01
ls_mat <- lapply(1:length(ontologies), function(k){
ontology <- ontologies[k]
message(sprintf("Analysing '%s' (%s) ...", ontology, as.character(Sys.time())), appendLF=T)
ls_df <- lapply(1:length(ls_cluster), function(i){
message(sprintf("\t'%s' %d (%s) ...", ontology, i, as.character(Sys.time())), appendLF=T)
eTerm <- xEnricherGenes(data=ls_cluster[[i]], ontology=ontology, background=background, size.range=c(10,2000), test="fisher", min.overlap=10, p.tail="one-tail", RData.location=RData.location, verbose=F, silent=T)
df <- xEnrichViewer(eTerm, top_num="all", sortBy="none", details=T)
if(is.null(df)){
return(NULL)
}else{
df$namespace <- rep(ontology, nrow(df))
cbind(cluster=rep(paste0('C',names(ls_cluster)[i]),nrow(df)), id=rownames(df), df, stringsAsFactors=F)
}
})
df_all <- do.call(rbind, ls_df)
ind <- which(df_all$adjp < FDR.cutoff)
if(length(ind)>=2){
df <- as.data.frame(df_all %>% dplyr::filter(adjp < FDR.cutoff))
}else{
return(NULL)
}
mat_fdr <- as.matrix(xSparseMatrix(df[,c('name','cluster','adjp')], rows=unique(df$name), columns=paste0('C',1:length(ls_cluster))))
mat_fdr[mat_fdr==0] <- NA
mat_fdr <- -log10(mat_fdr)
mat_fdr <- mat_fdr[order(-nchar(rownames(mat_fdr))), ]
as.data.frame(mat_fdr)
})
mat <- do.call(rbind, ls_mat)
gp <- xHeatmap(mat, reorder="none", colormap='skyblue-darkorange', ncolors=64, zlim=c(2,8), legend.title="Log10(FDR)\n", barwidth=0.4, x.rotate=60, shape=19, size=2, x.text.size=6,y.text.size=6, na.color='transparent',barheight=5)
gp + theme(legend.title=element_text(size=8))